
library(googledrive) 
library(purrr)
library(dplyr)
library(data.table)
library(ggplot2)
library(plyr)
#setwd("C:/Users/rcarrill/Box Sync/WHO STEPS/Urine sodium/00.Data input")
### --- DOWNLOAD ALL EXTRACTED SURVEYS INTO A LOCAL FOLDER
# Remember to set the working directory to the folder where the surveys
# should be stored before running this section.
# Approach adapted from:
# https://community.rstudio.com/t/how-to-download-a-google-drives-contents-based-on-drive-id-or-url/16896
drive_folder <- drive_get(
  as_id("https://drive.google.com/drive/folders/1dtMYqYKTiY-8PkMqnsPIoS7igGf-7N7i")
)
remote_csvs <- drive_ls(drive_folder, type = "csv")
# One download per listed file id; walk() is used only for its side effect.
walk(remote_csvs$id, function(file_id) drive_download(as_id(file_id)))
# Remove every object created so far from the workspace.
rm(list = ls())
###############################################################################################


# Start here if the surveys have already been downloaded!
### --- LOAD ALL DATA FILES (23 DATASETS); 23 of them carry urinary creatinine and sodium information.
setwd("~/Desktop/Artículos/STEPS/Data/Extracted")
#change wd for Rodrigo


### BRING ALL DATASETS IN THE FOLDER
# `pattern` is a regular expression, not a glob: anchor it so that only files
# whose name ends in ".csv" are picked up.
# sort() keeps the files in the same order the previous ls()-based approach
# produced, so the pooled row/column order (relied on further down by the
# position-based column selection and the seeded split) does not change.
datasets <- sort(list.files(pattern = "\\.csv$"))
if (length(datasets) == 0) {
  stop("No .csv files found in ", getwd(), call. = FALSE)
}
length(datasets)



### GENERATE A LIST WITH ALL DATASETS BROUGHT
# Read each file straight into a list instead of creating one global object
# per file and collecting them again with ls()/get(); this no longer depends
# on the workspace being empty beforehand.
datalist <- lapply(datasets, read.csv)



### APPEND ALL DATASETS BROUGHT FROM THE FOLDER
# rbind.fill() stacks the surveys and fills columns missing from a survey with NA.
pooleddata <- plyr::rbind.fill(datalist)
# Leave only the pooled data in the workspace, as before.
rm(list = setdiff(ls(), "pooleddata"))
###############################################################################################





### --- NEW VARIABLES / RECODING
# Blood pressure: average of the second and third readings (the first reading
# is not used). The result is NA whenever either reading is NA.
pooleddata[["sbp"]] <- (pooleddata[["sbp2"]] + pooleddata[["sbp3"]]) / 2
pooleddata[["dbp"]] <- (pooleddata[["dbp2"]] + pooleddata[["dbp3"]]) / 2
# The individual readings are no longer needed once the averages exist.
pooleddata <- pooleddata[
  ,
  !(names(pooleddata) %in% c("sbp1", "sbp2", "sbp3", "dbp1", "dbp2", "dbp3")),
  drop = FALSE
]
# Body mass index: weight / height^2.
pooleddata[["bmi"]] <- pooleddata[["weight"]] / pooleddata[["height"]]^2



  #Salt consumption by all formulas
  # Each estimate is computed from the spot-urine values and then divided by
  # 17.1 to convert it to g of salt/day. Rows with sex == 1 use the first
  # equation of each pair, all other rows the second; NA sex yields NA.
  # Helper values are created inside with(), so no extra columns or global
  # objects are added and the column order of pooleddata is unchanged.

    #Intersalt
  pooleddata$estimated_sodium_excretion_intersalt <- with(
    pooleddata,
    ifelse(
      sex == 1,
      (23.51 + 0.45 * u_sodium) - 3.09 * u_creatinine + 4.16 * bmi + 0.22 * age,
      3.74 + (0.33 * u_sodium) - 2.44 * u_creatinine + 2.42 * bmi + 2.34 * age - 0.03 * age^2
    )
  ) / 17.1

    #Tanaka
    # Per the original author's note: creatinine is converted back to mg/dl
    # (the code divides by 0.08842) and height back to cm (* 100).
  pooleddata$estimated_sodium_excretion_tanaka <- with(pooleddata, {
    creatinine_converted <- u_creatinine * 10 / 0.08842
    height_cm <- height * 100
    21.98 * ((u_sodium / creatinine_converted) *
      (16.14 * height_cm + 14.89 * weight - 2.04 * age - 2244.45))^0.392
  }) / 17.1

    #Kawasaki, consider this: the Kawasaki equation was developed to estimate
    # 24-h urinary sodium excretion based on the second morning void only.
    # Same creatinine and height conversions as for Tanaka.
  pooleddata$estimated_sodium_excretion_kawasaki <- with(pooleddata, {
    creatinine_converted <- u_creatinine * 10 / 0.08842
    height_cm <- height * 100
    ifelse(
      sex == 1,
      16.3 * ((u_sodium / creatinine_converted) *
        (7.39 * height_cm + 15.12 * weight - 12.63 * age - 79.9))^0.5,
      16.3 * ((u_sodium / creatinine_converted) *
        (5.09 * height_cm + 8.58 * weight - 4.72 * age - 74.95))^0.5
    )
  }) / 17.1

    #Toft
    # Same creatinine and height conversions as for Tanaka.
  pooleddata$estimated_sodium_excretion_toft <- with(pooleddata, {
    creatinine_converted <- u_creatinine * 10 / 0.08842
    height_cm <- height * 100
    ifelse(
      sex == 1,
      33.56 * ((u_sodium / creatinine_converted) *
        (-7.54 * age + 14.15 * weight + 3.48 * height_cm + 423.15))^0.345,
      52.65 * ((u_sodium / creatinine_converted) *
        (-6.13 * age + 9.97 * weight + 2.45 * height_cm + 342.73))^0.196
    )
  }) / 17.1
  ###############################################################################################
  
  
  
  

### --- EXCLUSION CRITERIA
# Every filter keeps only the rows for which its condition is TRUE. Rows where
# the condition evaluates to NA are dropped as well (subset() treats NA as
# "do not keep", exactly like indexing through which()).

# 1. PEOPLE AGED 15-69
pooleddata <- subset(pooleddata, age >= 15 & age <= 69)



# 2. MISSING WEIGHT OR HEIGHT OR SBP/DBP OR LABS OR PSU/STRATUM/WSTEP3
# drug_hyper and drug_diabetes used to be required here as well; they are no
# longer required for the final model.
pooleddata <- subset(
  pooleddata,
  !is.na(weight) & !is.na(height) &
    !is.na(sbp) & !is.na(dbp) &
    !is.na(u_creatinine) & !is.na(u_sodium) &
    !is.na(wstep3) & !is.na(psu) & !is.na(stratum)
)



# 3. IMPLAUSIBLE RANGES WEIGHT, HEIGHT, BMI AND SBP/DBP
pooleddata <- subset(
  pooleddata,
  weight >= 12 & weight <= 300 &
    height >= 1.00 & height <= 2.50 &
    sbp >= 70 & sbp <= 270 &
    dbp >= 30 & dbp <= 150 &
    bmi >= 10 & bmi <= 80
)



# 4. PREGNANT
# Rows with unknown pregnancy status are kept.
pooleddata <- subset(pooleddata, is_pregnant == 0 | is.na(is_pregnant))



# 5. PRESUMABLY CREATININE MEASURED WITH PORTABLE DEVICES (OR OTHER UNITS OF SODIUM)
# Visual check per survey before dropping anything.
ggplot(pooleddata, aes(x = age, y = u_creatinine)) +
  geom_point() +
  facet_grid(~ study_id)
ggplot(pooleddata, aes(x = age, y = u_sodium)) +
  geom_point() +
  facet_grid(~ study_id)
# Georgia: the results are not continuous (they take very fixed values); we
#          agreed not to use it.
# Afghanistan: WHO recommended not using it: "results are quite unreliable".
# Rows with a missing country are dropped too, because the comparison is NA.
pooleddata <- subset(pooleddata, country != "Georgia" & country != "Afghanistan")


# Tonga was removed temporarily while diabetes (DM2) information was in use,
# because that survey did not have it:
#pooleddata <- pooleddata[which(!pooleddata$country == "Tonga"),]



# 7. IMPLAUSIBLE URINE CREATININE
# Lower bound for everyone; upper bound depends on sex (28.3 for sex == 2,
# 32.7 for sex == 1). Rows with any other or missing sex value are dropped.
pooleddata <- subset(
  pooleddata,
  u_creatinine >= 1.8 &
    ((sex == 2 & u_creatinine <= 28.3) | (sex == 1 & u_creatinine <= 32.7))
)



# 8. NEGATIVE LABS OR SALT CONSUMPTION
# Zero and negative values are removed; NA/NaN estimates are removed as well.
pooleddata <- subset(
  pooleddata,
  u_creatinine > 0 & u_sodium > 0 &
    estimated_sodium_excretion_intersalt > 0 &
    estimated_sodium_excretion_tanaka > 0 &
    estimated_sodium_excretion_kawasaki > 0 &
    estimated_sodium_excretion_toft > 0
)



# 9. EXCLUDE DAILY SALT CONSUMPTION IF BELOW/ABOVE 3 STANDARD DEVIATIONS FROM THE MEAN (FORMULA-SPECIFIC)
# All eight bounds are computed on the same data, before any of them is
# applied, so no formula's cut-off is affected by another formula's exclusions.
upper_intersalt <- with(
  pooleddata,
  mean(estimated_sodium_excretion_intersalt) + 3 * sd(estimated_sodium_excretion_intersalt)
)
lower_intersalt <- with(
  pooleddata,
  mean(estimated_sodium_excretion_intersalt) - 3 * sd(estimated_sodium_excretion_intersalt)
)

upper_tanaka <- with(
  pooleddata,
  mean(estimated_sodium_excretion_tanaka) + 3 * sd(estimated_sodium_excretion_tanaka)
)
lower_tanaka <- with(
  pooleddata,
  mean(estimated_sodium_excretion_tanaka) - 3 * sd(estimated_sodium_excretion_tanaka)
)

upper_kawasaki <- with(
  pooleddata,
  mean(estimated_sodium_excretion_kawasaki) + 3 * sd(estimated_sodium_excretion_kawasaki)
)
lower_kawasaki <- with(
  pooleddata,
  mean(estimated_sodium_excretion_kawasaki) - 3 * sd(estimated_sodium_excretion_kawasaki)
)

upper_toft <- with(
  pooleddata,
  mean(estimated_sodium_excretion_toft) + 3 * sd(estimated_sodium_excretion_toft)
)
lower_toft <- with(
  pooleddata,
  mean(estimated_sodium_excretion_toft) - 3 * sd(estimated_sodium_excretion_toft)
)


# Keep only the rows that lie within the bounds of every formula.
pooleddata <- subset(
  pooleddata,
  estimated_sodium_excretion_intersalt >= lower_intersalt &
    estimated_sodium_excretion_intersalt <= upper_intersalt &
    estimated_sodium_excretion_tanaka >= lower_tanaka &
    estimated_sodium_excretion_tanaka <= upper_tanaka &
    estimated_sodium_excretion_kawasaki >= lower_kawasaki &
    estimated_sodium_excretion_kawasaki <= upper_kawasaki &
    estimated_sodium_excretion_toft >= lower_toft &
    estimated_sodium_excretion_toft <= upper_toft
)
###############################################################################################





### --- EDA: EXPLORATORY DATA ANALYSIS
# Five-number summaries of the predictors ...
summary(pooleddata[["age"]])
summary(pooleddata[["weight"]])
summary(pooleddata[["height"]])
summary(pooleddata[["u_creatinine"]])
summary(pooleddata[["u_sodium"]])
# ... and of the four salt-intake estimates.
summary(pooleddata[["estimated_sodium_excretion_intersalt"]])
summary(pooleddata[["estimated_sodium_excretion_tanaka"]])
summary(pooleddata[["estimated_sodium_excretion_kawasaki"]])
summary(pooleddata[["estimated_sodium_excretion_toft"]])



# Sex distribution, with an explicit count of missing values.
table(pooleddata[["sex"]], useNA = "always")
#table(pooleddata$self_hyper, useNA = c("always"))



# Count negative values (sign() is -1, 0 or 1); only 1 should remain after the
# exclusion criteria.
table(sign(pooleddata[["estimated_sodium_excretion_intersalt"]]))
table(sign(pooleddata[["estimated_sodium_excretion_tanaka"]]))
table(sign(pooleddata[["estimated_sodium_excretion_kawasaki"]]))
table(sign(pooleddata[["estimated_sodium_excretion_toft"]]))



# Overview of every column.
summary(pooleddata)
###############################################################################################


### - SAVE ONE COMPLETE DATASET
# Print the column names so the position-based selection below can be checked.
names(pooleddata)
# Columns are selected by position; the file name carries today's date.
write.csv(
  pooleddata[, c(1:4, 6, 7, 10, 14, 15, 18, 19, 22, 23, 25:35)],
  file = paste0("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_pooleddata_", Sys.Date(), ".csv"),
  row.names = FALSE,
  fileEncoding = "UTF-8"
)


### --- SAVE THREE DATASETS (TRAIN, TEST AND VALIDATION)
# 50% / 30% / 20% split. The group sizes are whole numbers that add up to
# nrow(pooleddata) by construction: the middle group takes whatever the two
# floor()ed groups leave over.
# Previously fractional counts were passed to rep(), which truncates them, so
# the label vector could be shorter than the data and split() recycled it.
# NOTE: when the old counts were already exact (e.g. nrow is a multiple of 10)
# the partition is identical to the one produced before; otherwise it differs.
c1 <- floor(nrow(pooleddata) * 0.5)
c3 <- floor(nrow(pooleddata) * 0.2)
c2 <- nrow(pooleddata) - c1 - c3
nrow(pooleddata) == c1 + c2 + c3


set.seed(123)
# Shuffle the group labels and split on them; fixing the factor levels
# guarantees three elements even if a group happens to be empty.
result <- split(
  pooleddata,
  factor(sample(rep(1:3, times = c(c1, c2, c3))), levels = 1:3)
)
names(result) <- paste0("df", seq_along(result))
list2env(result, .GlobalEnv)
# The three parts must add up to the pooled data (this used to reference `df`,
# which is the stats::df function, so nothing was actually being checked).
nrow(df1) + nrow(df2) + nrow(df3) == nrow(pooleddata)


# Making sure there are no duplicated rows across the parts.
# semi_join(a, b) keeps the rows of a that also appear in b.
proof <- c(
  df1_in_df2 = nrow(semi_join(df1, df2)),            # should be 0
  df1_in_df3 = nrow(semi_join(df1, df3)),            # should be 0
  df2_in_df3 = nrow(semi_join(df2, df3)),            # should be 0
  df1_in_pooled = nrow(semi_join(df1, pooleddata))   # should equal nrow(df1)
)
proof

rm(proof, result)

# Train set (df1). Columns are selected by position, as for the pooled file.
names(df1)
write.csv(df1[, c(1:4, 6:7, 10, 14:15, 18, 19, 22:23, 25:26, 27:35)],
          paste0("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_train_", Sys.Date(), ".csv"),
          row.names = FALSE,
          fileEncoding = 'UTF-8')



# Test set (df2).
names(df2)
write.csv(df2[, c(1:4, 6:7, 10, 14:15, 18, 19, 22:23, 25:26, 27:35)],
          paste0("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_test_", Sys.Date(), ".csv"),
          row.names = FALSE,
          fileEncoding = 'UTF-8')




# Validation set (df3).
names(df3)
write.csv(df3[, c(1:4, 6:7, 10, 14:15, 18, 19, 22:23, 25:26, 27:35)],
          paste0("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_validate_", Sys.Date(), ".csv"),
          row.names = FALSE,
          fileEncoding = 'UTF-8')


#Read data:
# NOTE(review): each call overwrites `x`, so only the train file is left in `x`
# afterwards. The file names pin a specific extraction date (2021-11-22) rather
# than the Sys.Date() used when writing.
x<-read.csv("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_test_2021-11-22.csv")
x<-read.csv("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_validate_2021-11-22.csv")
x<-read.csv("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_train_2021-11-22.csv")

###############################################################################################
# svydesign(), svyby(), svymean() and svyciprop() come from the survey package,
# which was never attached by this script.
library(survey)

data <- read.csv("~/Desktop/Artículos/STEPS/Data/Extraction_urine_sodium_pooleddata_2021-11-22.csv")
# Complex survey design: PSUs nested within strata, weighted by wstep3.
# NOTE(review): the object names `sd` and `data` shadow base/stats functions;
# they are kept because later code may refer to them.
sd <- svydesign(id = ~psu, strata = ~stratum, weights = ~wstep3, data = data, nest = TRUE)

#Sup table: Weighted distribution of predictors in each survey included in the ML model development.
## all predictors
data <- data.table(data)
# Unweighted region, ranges and sample size per survey. `Freq` is the number
# of rows of each survey (country + study_id + data_year); it used to be
# tabulated per country and joined by country only, which repeated the pooled
# country total on every survey of a country with more than one survey.
other <- data[, list(region = first(region), age_min = min(age), age_max = max(age),
                     sbp_min = round(min(sbp)), sbp_max = round(max(sbp)),
                     dbp_min = round(min(dbp)), dbp_max = round(max(dbp)),
                     weight_min = min(weight), weight_max = max(weight),
                     height_min = min(height), height_max = max(height),
                     creatinine_min = min(u_creatinine), creatinine_max = max(u_creatinine),
                     sodium_min = min(u_sodium), sodium_max = max(u_sodium),
                     Freq = .N),
              by = list(country, study_id, data_year)]
# Survey-weighted means (and the weighted proportion with sex == 1) per survey.
age <- svyby(~age, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)
proportion_men <- svyby(~sex == 1, by = ~country+study_id+data_year, design = sd, FUN = svyciprop, vartype = c('ci'), na.rm = TRUE)
sbp <- svyby(~sbp, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)
dbp <- svyby(~dbp, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)
weight <- svyby(~weight, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)
height <- svyby(~height, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)
sodium <- svyby(~u_sodium, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)
creatinine <- svyby(~u_creatinine, by = ~country+study_id+data_year, design = sd, FUN = svymean, vartype = c('ci'), na.rm = TRUE)


# One row per survey: unweighted summaries first, weighted estimates joined on.
sup <- join_all(list(other, age, proportion_men, sbp, dbp, weight, height, sodium, creatinine),
                by = c("country", "study_id", "data_year"), type = 'left')

# Keep the point estimates and ranges (the confidence-interval columns are
# dropped here), plus the per-survey sample size.
sup <- select(sup, country, data_year, region, age, age_min, age_max, `sex == 1`, sbp, sbp_min,
              sbp_max, dbp, dbp_min, dbp_max, weight, weight_min, weight_max, height, height_min,
              height_max, u_sodium, sodium_min, sodium_max, u_creatinine, creatinine_min, creatinine_max,
              Freq)
rm(other, age, proportion_men, sbp, dbp, weight, height, sodium, creatinine)

################
# Express the proportion with sex == 1 as a percentage before rounding.
sup[["sex == 1"]] <- sup[["sex == 1"]] * 100

# Number of decimals to keep for each numeric column of the table.
round_digits <- c(
  "age" = 0, "sex == 1" = 1, "sbp" = 0, "dbp" = 0,
  "weight" = 1, "weight_min" = 1, "weight_max" = 1,
  "height" = 2, "height_min" = 2, "height_max" = 2,
  "u_sodium" = 1, "sodium_min" = 1, "sodium_max" = 1,
  "u_creatinine" = 1, "creatinine_min" = 1, "creatinine_max" = 1
)
for (col_name in names(round_digits)) {
  sup[[col_name]] <- round(sup[[col_name]], round_digits[[col_name]])
}

# Age is reported as a separate "min-max" column.
sup[["age_range"]] <- paste0(sup[["age_min"]], "-", sup[["age_max"]])

# Every other predictor becomes a single "mean (min-max)" label. The names of
# this vector are the mean columns; the values are the prefixes of the matching
# *_min / *_max columns.
range_prefix <- c(
  "sbp" = "sbp", "dbp" = "dbp", "weight" = "weight", "height" = "height",
  "u_sodium" = "sodium", "u_creatinine" = "creatinine"
)
for (col_name in names(range_prefix)) {
  range_min <- sup[[paste0(range_prefix[[col_name]], "_min")]]
  range_max <- sup[[paste0(range_prefix[[col_name]], "_max")]]
  sup[[col_name]] <- paste0(sup[[col_name]], " (", range_min, "-", range_max, ")")
}
rm(round_digits, range_prefix, col_name, range_min, range_max)

# Final column order of the supplementary table.
sup <- select(
  sup,
  country, data_year, region, Freq, age, age_range, `sex == 1`,
  sbp, dbp, weight, height, u_sodium, u_creatinine
)

# Human-readable headers, in the same order as the columns selected above.
names(sup) <- c(
  "Country", "Year", "Region", "Sample size", "Mean age (years)", "Age range (years)",
  "Proportion of men (%)",
  "Mean, minimum and maximum \nvalues of SBP (mmHg)",
  "Mean, minimum and maximum \nvalues of DBP (mmHg)",
  "Mean, minimum and maximum \nvalues of weight (kg)",
  "Mean, minimum and maximum \nvalues of height (m)",
  "Mean, minimum and maximum \nvalues of urinary sodium (mmol/L)",
  "Mean, minimum and maximum \nvalues of urinary creatinine (mmol/L)"
)

write.csv(
  sup,
  file = paste0("~/Desktop/Artículos/STEPS/Supplementary materials/Weighted_distribution_predictors_by_STEPS_", Sys.Date(), ".csv"),
  row.names = FALSE
)


##For text:
# Reads back a previously saved table; the date in the file name is fixed.
data <- read.csv("~/Desktop/Artículos/STEPS/Supplementary materials/Weighted_distribution_predictors_by_STEPS_2021-12-01.csv")



